newdata <- read.csv("new data.csv")
#data transformation: generate a new variable to measure the level of racial diversity
#Create a new dataset(data1) that includes generalized variety
#Have to change the raw number of White, Black, Hispanic to proportion of the whole population.
data1 <- newdata %>% 
  mutate(Black = BlackE/Totalpop) %>% 
  mutate(White = WhiteE/Totalpop) %>% 
  mutate(Hispanic = HispanicE/Totalpop) %>% 
  mutate(Racial_diversity=1-White^2-Black^2-Hispanic^2) %>%
  mutate(norm_Racial_diversity = Racial_diversity*(3/2))
#Try to analysis the data in the single year (2017), which is the same year as the old dateset.
#Create a new dataset that contains only the data in the year of 2017 (data2)
data2 <- filter(data1,Year=="2017")

#Check the Cr plots for test model
fittest <- lm(Voe~Union+Urbanity+Unemployment+GDP_Per_Capita+Totalpop,data = data2)
summary(fittest)
## 
## Call:
## lm(formula = Voe ~ Union + Urbanity + Unemployment + GDP_Per_Capita + 
##     Totalpop, data = data2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -217771  -52693   -7043   40849  659954 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.106e+04  8.970e+04   0.458    0.649    
## Union          -1.304e+04  1.610e+05  -0.081    0.936    
## Urbanity        6.237e+03  1.605e+05   0.039    0.969    
## Unemployment   -1.475e+04  2.028e+04  -0.727    0.471    
## GDP_Per_Capita  8.505e-01  9.558e-01   0.890    0.378    
## Totalpop        1.867e-02  2.517e-03   7.418 2.46e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 124700 on 45 degrees of freedom
## Multiple R-squared:  0.5563, Adjusted R-squared:  0.507 
## F-statistic: 11.28 on 5 and 45 DF,  p-value: 4.352e-07
par(mfrow=c(2,2))
plot(fittest)
## Warning: not plotting observations with leverage one:
##   45

## Warning: not plotting observations with leverage one:
##   45

crPlots(fittest)

#transformation
#fit1 seems to be the best with the least outliners and more valid assumptions
#Still, in the residuals versus leverage plot, there is a outlier that could impact the coefficient.
#Need to check out which state it is
fit1 <- lm(log(Voe)~Union+Urbanity+Unemployment+GDP_Per_Capita+Totalpop,data = data2)
crPlots(fit1)

summary(fit1)
## 
## Call:
## lm(formula = log(Voe) ~ Union + Urbanity + Unemployment + GDP_Per_Capita + 
##     Totalpop, data = data2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8614 -0.6430  0.0000  0.5212  2.0161 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.094e+01  6.951e-01  15.740  < 2e-16 ***
## Union           1.179e+00  1.247e+00   0.945  0.34974    
## Urbanity       -1.188e+00  1.243e+00  -0.956  0.34431    
## Unemployment   -1.082e-01  1.571e-01  -0.689  0.49452    
## GDP_Per_Capita -2.109e-05  7.406e-06  -2.847  0.00662 ** 
## Totalpop        1.482e-07  1.951e-08   7.596 1.34e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9663 on 45 degrees of freedom
## Multiple R-squared:  0.6056, Adjusted R-squared:  0.5618 
## F-statistic: 13.82 on 5 and 45 DF,  p-value: 3.461e-08
par(mfrow=c(2,2))
plot(fit1)
## Warning: not plotting observations with leverage one:
##   45

## Warning: not plotting observations with leverage one:
##   45

fit2 <- lm(log(Voe)~Union+Urbanity+Unemployment+log(GDP_Per_Capita)+log(Totalpop),data=data2)
crPlots(fit2)

summary(fit2)
## 
## Call:
## lm(formula = log(Voe) ~ Union + Urbanity + Unemployment + log(GDP_Per_Capita) + 
##     log(Totalpop), data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.48200 -0.55114 -0.09603  0.61909  1.46037 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4.2372     5.7329  -0.739    0.464    
## Union                 0.9387     1.0327   0.909    0.368    
## Urbanity             -0.9465     1.0292  -0.920    0.363    
## Unemployment         -0.2122     0.1278  -1.661    0.104    
## log(GDP_Per_Capita)  -0.2546     0.4945  -0.515    0.609    
## log(Totalpop)         1.1973     0.1115  10.739 5.32e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7999 on 45 degrees of freedom
## Multiple R-squared:  0.7297, Adjusted R-squared:  0.6997 
## F-statistic:  24.3 on 5 and 45 DF,  p-value: 9.13e-12
par(mfrow=c(2,2))
plot(fit2)
## Warning: not plotting observations with leverage one:
##   45

## Warning: not plotting observations with leverage one:
##   45

#Analyze the data between the year of 2009 to the year of 2017
#Create a new dataset(data3) that excludes year between 2005 and 2008
#fit a linear regression without transformation
data3 <- filter(data1,Year>2008)
fit4 = lm(Voe ~ factor(Year)+Union+Urbanity+Unemployment+GDP_Per_Capita+Totalpop,data = data3)
summary(fit4)
## 
## Call:
## lm(formula = Voe ~ factor(Year) + Union + Urbanity + Unemployment + 
##     GDP_Per_Capita + Totalpop, data = data3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -193257  -41159  -12379   28174  741459 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.080e+05  2.820e+04   3.830 0.000147 ***
## factor(Year)2010 -1.338e+02  1.800e+04  -0.007 0.994071    
## factor(Year)2011 -1.128e+04  1.802e+04  -0.626 0.531749    
## factor(Year)2012 -2.144e+04  1.831e+04  -1.171 0.242042    
## factor(Year)2013 -1.131e+04  1.810e+04  -0.625 0.532245    
## factor(Year)2014 -3.100e+04  1.870e+04  -1.657 0.098137 .  
## factor(Year)2015 -3.832e+04  1.956e+04  -1.959 0.050744 .  
## factor(Year)2016 -5.013e+04  2.090e+04  -2.398 0.016878 *  
## factor(Year)2017 -5.094e+04  2.169e+04  -2.349 0.019269 *  
## Union            -1.972e+03  3.133e+04  -0.063 0.949832    
## Urbanity         -2.937e+03  3.133e+04  -0.094 0.925358    
## Unemployment     -1.240e+04  2.794e+03  -4.437 1.15e-05 ***
## GDP_Per_Capita    4.152e-01  2.166e-01   1.917 0.055877 .  
## Totalpop          1.534e-02  6.606e-04  23.225  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 90780 on 445 degrees of freedom
## Multiple R-squared:  0.5567, Adjusted R-squared:  0.5437 
## F-statistic: 42.98 on 13 and 445 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(fit4)

crPlots(fit4)

#Based on the Component residue plot, it seems all the varibales should take log transformation?
#Or should we choose the same variable as before to take log transformation?
#Should we include gini to the confound variables?
fit5 = lm(log(Voe) ~ factor(Year)+Union+Urbanity+Unemployment+log(GDP_Per_Capita)+log(Totalpop),data = data3)
summary(fit5)
## 
## Call:
## lm(formula = log(Voe) ~ factor(Year) + Union + Urbanity + Unemployment + 
##     log(GDP_Per_Capita) + log(Totalpop), data = data3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4891 -0.5981 -0.1319  0.6285  2.1767 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -2.26484    1.88185  -1.204 0.229417    
## factor(Year)2010     0.04578    0.15960   0.287 0.774380    
## factor(Year)2011    -0.18514    0.15976  -1.159 0.247148    
## factor(Year)2012    -0.13545    0.16245  -0.834 0.404846    
## factor(Year)2013    -0.14405    0.16050  -0.898 0.369935    
## factor(Year)2014    -0.28449    0.16624  -1.711 0.087719 .  
## factor(Year)2015    -0.28997    0.17445  -1.662 0.097173 .  
## factor(Year)2016    -0.33791    0.18719  -1.805 0.071731 .  
## factor(Year)2017    -0.38384    0.19459  -1.973 0.049161 *  
## Union                0.16653    0.27765   0.600 0.548968    
## Urbanity            -0.18483    0.27766  -0.666 0.505983    
## Unemployment        -0.09824    0.02587  -3.797 0.000167 ***
## log(GDP_Per_Capita) -0.32691    0.15789  -2.070 0.038984 *  
## log(Totalpop)        1.12068    0.04054  27.647  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8046 on 445 degrees of freedom
## Multiple R-squared:  0.6682, Adjusted R-squared:  0.6585 
## F-statistic: 68.94 on 13 and 445 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(fit5)

crPlots(fit5)

data4 <- filter(data3, State!="District of Columbia")
fit6 = lm(log(Voe) ~ factor(Year)+Union+Urbanity+Unemployment+log(GDP_Per_Capita)+log(Totalpop),data = data4)
summary(fit6)
## 
## Call:
## lm(formula = log(Voe) ~ factor(Year) + Union + Urbanity + Unemployment + 
##     log(GDP_Per_Capita) + log(Totalpop), data = data4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6513 -0.6246 -0.0998  0.5012  2.2473 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -12.279640   2.629621  -4.670 4.02e-06 ***
## factor(Year)2010      0.003009   0.157390   0.019  0.98475    
## factor(Year)2011     -0.178230   0.157410  -1.132  0.25814    
## factor(Year)2012     -0.082247   0.160213  -0.513  0.60796    
## factor(Year)2013     -0.157984   0.158215  -0.999  0.31857    
## factor(Year)2014     -0.211286   0.164241  -1.286  0.19897    
## factor(Year)2015     -0.160555   0.173354  -0.926  0.35487    
## factor(Year)2016     -0.119583   0.187738  -0.637  0.52448    
## factor(Year)2017     -0.127238   0.195956  -0.649  0.51647    
## Union                 0.164985   0.270878   0.609  0.54279    
## Urbanity             -0.203854   0.270924  -0.752  0.45219    
## Unemployment         -0.022057   0.028919  -0.763  0.44605    
## log(GDP_Per_Capita)   0.670670   0.242775   2.763  0.00598 ** 
## log(Totalpop)         1.045557   0.041929  24.936  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7848 on 436 degrees of freedom
## Multiple R-squared:  0.648,  Adjusted R-squared:  0.6375 
## F-statistic: 61.75 on 13 and 436 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(fit6)

crPlots(fit6)

#Add racial diversity
fit7 = lm(log(Voe) ~ factor(Year)+Union+Urbanity+Unemployment+log(GDP_Per_Capita)+log(Totalpop)+norm_Racial_diversity,data = data3)
summary(fit7)
## 
## Call:
## lm(formula = log(Voe) ~ factor(Year) + Union + Urbanity + Unemployment + 
##     log(GDP_Per_Capita) + log(Totalpop) + norm_Racial_diversity, 
##     data = data3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4913 -0.6040 -0.1289  0.6286  2.1776 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -2.19154    2.06347  -1.062 0.288784    
## factor(Year)2010       0.04576    0.15978   0.286 0.774683    
## factor(Year)2011      -0.18563    0.16004  -1.160 0.246718    
## factor(Year)2012      -0.13655    0.16312  -0.837 0.402979    
## factor(Year)2013      -0.14468    0.16084  -0.900 0.368869    
## factor(Year)2014      -0.28616    0.16753  -1.708 0.088307 .  
## factor(Year)2015      -0.29236    0.17679  -1.654 0.098897 .  
## factor(Year)2016      -0.34117    0.19111  -1.785 0.074916 .  
## factor(Year)2017      -0.38751    0.19930  -1.944 0.052487 .  
## Union                  0.16635    0.27797   0.598 0.549858    
## Urbanity              -0.18463    0.27798  -0.664 0.506913    
## Unemployment          -0.09896    0.02721  -3.637 0.000308 ***
## log(GDP_Per_Capita)   -0.33230    0.16974  -1.958 0.050896 .  
## log(Totalpop)          1.11944    0.04301  26.025  < 2e-16 ***
## norm_Racial_diversity  0.01568    0.18006   0.087 0.930652    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8055 on 444 degrees of freedom
## Multiple R-squared:  0.6682, Adjusted R-squared:  0.6578 
## F-statistic: 63.88 on 14 and 444 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(fit7)

crPlots(fit7)

#1 percentage point increase in unionization would increase the vocational training amount to exp(0.16635) -> 1.16635 -> 17% larger?
#Mediation Analysis
model.m <- lm(norm_Racial_diversity~Union+Urbanity+log(Unemployment)+log(GDP_Per_Capita)+log(Totalpop)+factor(Year),data=data3)
model.y <- lm(log(Voe)~Union+norm_Racial_diversity+Urbanity+log(Unemployment)+log(GDP_Per_Capita)+log(Totalpop)+factor(Year),data=data3)
out.1 <- mediate(model.m,model.y,sims=1000,treat = "Union",mediator = "norm_Racial_diversity")
summary(out.1)
## 
## Causal Mediation Analysis 
## 
## Quasi-Bayesian Confidence Intervals
## 
##                Estimate 95% CI Lower 95% CI Upper p-value
## ACME            0.00192     -0.02447         0.04    0.92
## ADE             0.13230     -0.42702         0.67    0.63
## Total Effect    0.13423     -0.42138         0.68    0.63
## Prop. Mediated  0.00133     -0.35837         0.38    0.93
## 
## Sample Size Used: 459 
## 
## 
## Simulations: 1000
plot(out.1)

#In this case, we find the racial diversity does not have a mediation effect.